# Always print this out before your assignment
sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods  
## [7] base     
## 
## other attached packages:
## [1] knitr_1.36
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.28   R6_2.5.1        jsonlite_1.7.2  magrittr_2.0.1 
##  [5] evaluate_0.14   rlang_0.4.11    stringi_1.7.5   jquerylib_0.1.4
##  [9] bslib_0.3.1     rmarkdown_2.10  tools_4.1.1     stringr_1.4.0  
## [13] xfun_0.25       yaml_2.2.1      fastmap_1.1.0   compiler_4.1.1 
## [17] htmltools_0.5.2 sass_0.4.0
getwd()
## [1] "/Users/angpham/Desktop/CPSC_Courses/MGSC310/final_project"
# Load all your libraries in this chunk 
library('tidyverse')
library('dplyr')
library('ggplot2')
library('forcats')
library('rsample')
library('glmnet')
library('glmnetUtils')
# NOTE: Do not run install.packages() inside a code chunk. Install them in the console outside of a code chunk. 

Question 1: How can audio features help predict the popularity score of a track?

Load Dataset

tracks <- read.csv(here::here("spotify_dataset", "spotify_tracks.csv"))

Clean Dataset & Summary Statistics

The follwing link details each of the features in the ‘tracks’ dataset: https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-tracks)

tracks %>% glimpse()
## Rows: 101,939
## Columns: 32
## $ X                 <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, …
## $ acousticness      <dbl> 0.294, 0.863, 0.750, 0.763, 0.770, 0.971, …
## $ album_id          <chr> "0D3QufeCudpQANOR7luqdr", "1bcqsH5UyTBzmh9…
## $ analysis_url      <chr> "https://api.spotify.com/v1/audio-analysis…
## $ artists_id        <chr> "['3mxJuHRn2ZWD5OofvJtDZY']", "['4xWMewm6C…
## $ available_markets <chr> "['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG'…
## $ country           <chr> "BE", "BE", "BE", "BE", "BE", "BE", "BE", …
## $ danceability      <dbl> 0.698, 0.719, 0.466, 0.719, 0.460, 0.367, …
## $ disc_number       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ duration_ms       <dbl> 235584, 656960, 492840, 316578, 558880, 18…
## $ energy            <dbl> 0.6060, 0.3080, 0.9310, 0.1260, 0.9420, 0.…
## $ href              <chr> "https://api.spotify.com/v1/tracks/5qljLQu…
## $ id                <chr> "5qljLQuKnNJf4F4vfxQB0V", "3VAX2MJdmdqARLS…
## $ instrumentalness  <dbl> 0.00000269, 0.00000000, 0.00000000, 0.0000…
## $ key               <dbl> 10, 6, 4, 3, 7, 11, 10, 3, 5, 5, 6, 0, 4, …
## $ liveness          <dbl> 0.1510, 0.2530, 0.9380, 0.1130, 0.9170, 0.…
## $ loudness          <dbl> -7.447, -10.340, -13.605, -20.254, -13.749…
## $ lyrics            <chr> "\n\nPerhaps I am bound to be restless\nAl…
## $ mode              <dbl> 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, …
## $ name              <chr> "Blood", "The Ugly Duckling", "Jimmy Launc…
## $ playlist          <chr> "Hipsteribrunssi", "Animal Stories", "Best…
## $ popularity        <dbl> 28, 31, 31, 14, 32, 45, 0, 26, 17, 29, 47,…
## $ preview_url       <chr> "https://p.scdn.co/mp3-preview/1b05a902da3…
## $ speechiness       <dbl> 0.0262, 0.9220, 0.9440, 0.9380, 0.9430, 0.…
## $ tempo             <dbl> 115.018, 115.075, 79.565, 112.822, 81.260,…
## $ time_signature    <dbl> 4, 3, 4, 3, 4, 4, 3, 4, 4, 4, 4, 4, 3, 4, …
## $ track_href        <chr> "https://api.spotify.com/v1/tracks/5qljLQu…
## $ track_name_prev   <chr> "track_14", "track_3", "track_4", "track_9…
## $ track_number      <dbl> 1, 3, 4, 1, 2, 8, 2, 11, 6, 6, 1, 3, 12, 5…
## $ uri               <chr> "spotify:track:5qljLQuKnNJf4F4vfxQB0V", "s…
## $ valence           <dbl> 0.6220, 0.5890, 0.0850, 0.5330, 0.0906, 0.…
## $ type              <chr> "track", "track", "track", "track", "track…

Below is a summary of the relevant features that will be used for this question. This includes genral information about a track, popularity, and its audio features:
- id: ID of th track, as identified by Spotify
- name: Name of track
- href: Link to Spotify API for complete information about a track
- uri: Unique identifier to access track on Spotify
- artists_id: List of artists IDs for a track
- album_id: Album ID of the album the track is a part of
- duration_ms: Length of track
- popularity: Popularity score for a track based on Spotify algorithms (Spotify indicates that it is largely based on number of plays.)
- acousticness: Confidence measure from 0.0 to 1.0 of whether the track is acoustic
- danceability: How suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity
- energy: Measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity
- instrumentalness: Predicts whether a track contains no vocals
- key: Key the track is in. Integers map to pitches using standard Pitch Class notation
- liveness: Detects the presence of an audience in the recording
- loudness: Quality of a sound that is the primary psychological correlate of physical strength (amplitude), measured in decibels (dB)
- mode: Modality (major or minor) of a track
- speechiness: Presence of spoken words in a track
- tempo: Speed or pace of a given piece in beats per minute (BPM)
- time_signature: Notational convention to specify how many beats are in each bar (or measure)
- valence: Musical positiveness conveyed by a track

tracks_clean <- na.omit(tracks) # Omit any tracks with an na value in any column

tracks_clean <- tracks[, c("id", "name", "href", "uri", "artists_id", "album_id", "duration_ms", "popularity", "acousticness", "danceability", "energy", "instrumentalness", "key", "liveness", "loudness", "mode", "speechiness", "tempo", "time_signature", "valence")]

tracks_clean %>% glimpse()
## Rows: 101,939
## Columns: 20
## $ id               <chr> "5qljLQuKnNJf4F4vfxQB0V", "3VAX2MJdmdqARLSU…
## $ name             <chr> "Blood", "The Ugly Duckling", "Jimmy Launch…
## $ href             <chr> "https://api.spotify.com/v1/tracks/5qljLQuK…
## $ uri              <chr> "spotify:track:5qljLQuKnNJf4F4vfxQB0V", "sp…
## $ artists_id       <chr> "['3mxJuHRn2ZWD5OofvJtDZY']", "['4xWMewm6CY…
## $ album_id         <chr> "0D3QufeCudpQANOR7luqdr", "1bcqsH5UyTBzmh9Y…
## $ duration_ms      <dbl> 235584, 656960, 492840, 316578, 558880, 183…
## $ popularity       <dbl> 28, 31, 31, 14, 32, 45, 0, 26, 17, 29, 47, …
## $ acousticness     <dbl> 0.294, 0.863, 0.750, 0.763, 0.770, 0.971, 0…
## $ danceability     <dbl> 0.698, 0.719, 0.466, 0.719, 0.460, 0.367, 0…
## $ energy           <dbl> 0.6060, 0.3080, 0.9310, 0.1260, 0.9420, 0.3…
## $ instrumentalness <dbl> 0.00000269, 0.00000000, 0.00000000, 0.00000…
## $ key              <dbl> 10, 6, 4, 3, 7, 11, 10, 3, 5, 5, 6, 0, 4, 5…
## $ liveness         <dbl> 0.1510, 0.2530, 0.9380, 0.1130, 0.9170, 0.6…
## $ loudness         <dbl> -7.447, -10.340, -13.605, -20.254, -13.749,…
## $ mode             <dbl> 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1…
## $ speechiness      <dbl> 0.0262, 0.9220, 0.9440, 0.9380, 0.9430, 0.0…
## $ tempo            <dbl> 115.018, 115.075, 79.565, 112.822, 81.260, …
## $ time_signature   <dbl> 4, 3, 4, 3, 4, 4, 3, 4, 4, 4, 4, 4, 3, 4, 4…
## $ valence          <dbl> 0.6220, 0.5890, 0.0850, 0.5330, 0.0906, 0.1…

summary(tracks_clean)
##       id                name               href          
##  Length:101939      Length:101939      Length:101939     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      uri             artists_id          album_id        
##  Length:101939      Length:101939      Length:101939     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##   duration_ms        popularity     acousticness     danceability  
##  Min.   :   1155   Min.   : 0.00   Min.   :0.0000   Min.   :0.000  
##  1st Qu.: 184000   1st Qu.:29.00   1st Qu.:0.0407   1st Qu.:0.480  
##  Median : 216893   Median :41.00   Median :0.2380   Median :0.610  
##  Mean   : 246771   Mean   :39.78   Mean   :0.3521   Mean   :0.586  
##  3rd Qu.: 261055   3rd Qu.:52.00   3rd Qu.:0.6450   3rd Qu.:0.714  
##  Max.   :5505831   Max.   :97.00   Max.   :0.9960   Max.   :0.989  
##      energy       instrumentalness         key        
##  Min.   :0.0000   Min.   :0.0000000   Min.   : 0.000  
##  1st Qu.:0.4110   1st Qu.:0.0000000   1st Qu.: 2.000  
##  Median :0.6290   Median :0.0000375   Median : 5.000  
##  Mean   :0.5865   Mean   :0.1487759   Mean   : 5.271  
##  3rd Qu.:0.7980   3rd Qu.:0.0344000   3rd Qu.: 8.000  
##  Max.   :1.0000   Max.   :1.0000000   Max.   :11.000  
##     liveness         loudness            mode         speechiness    
##  Min.   :0.0000   Min.   :-60.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0956   1st Qu.:-11.149   1st Qu.:0.0000   1st Qu.:0.0364  
##  Median :0.1240   Median : -7.599   Median :1.0000   Median :0.0506  
##  Mean   :0.1976   Mean   : -9.463   Mean   :0.6182   Mean   :0.1288  
##  3rd Qu.:0.2410   3rd Qu.: -5.509   3rd Qu.:1.0000   3rd Qu.:0.1040  
##  Max.   :0.9990   Max.   :  2.719   Max.   :1.0000   Max.   :0.9690  
##      tempo        time_signature     valence      
##  Min.   :  0.00   Min.   :0.000   Min.   :0.0000  
##  1st Qu.: 95.97   1st Qu.:4.000   1st Qu.:0.2710  
##  Median :118.07   Median :4.000   Median :0.4770  
##  Mean   :118.36   Mean   :3.876   Mean   :0.4828  
##  3rd Qu.:136.04   3rd Qu.:4.000   3rd Qu.:0.6930  
##  Max.   :244.03   Max.   :5.000   Max.   :0.9930

Below is a histogram showing the spread of the ‘popularity’ variable. From the plot below, we can see that the ‘popularity’ variable is normally distributed from 0 - 100, with 100 indicating most popular. The ‘popularity’ variable is not ordered by rank. Spotify indicates it largely based on number of plays, but includes other factors as well.

ggplot(tracks_clean, aes(popularity)) + geom_histogram(binwidth = 5) + theme_minimal()

The following plots show relationships between the popularity scores and each audio feature. The plots are not linear; however, some plots, like danceability and energy clearly show that higher values in those audio features indicate more popular music, as indicated by darker points in the top right hand corner.

ggplot(tracks_clean, aes(acousticness, popularity)) + geom_point(alpha = 0.05, color = "#5F4690") + geom_smooth() + theme_minimal() 

ggplot(tracks_clean, aes(danceability, popularity)) + geom_point(alpha = 0.05, color = "#1D6996") + geom_smooth() + theme_minimal()

ggplot(tracks_clean, aes(energy, popularity)) + geom_point(alpha = 0.05, color = "#38A6A5") + geom_smooth() + theme_minimal()

ggplot(tracks_clean, aes(instrumentalness, popularity)) + geom_point(alpha = 0.05, color = "#0F8554") + geom_smooth() + theme_minimal()

ggplot(tracks_clean, aes(key, popularity)) + geom_point(alpha = 0.05, color = "#73AF48") + geom_smooth() + theme_minimal()

ggplot(tracks_clean, aes(liveness, popularity)) + geom_point(alpha = 0.05, color = "#EDAD08") + geom_smooth() + theme_minimal()

ggplot(tracks_clean, aes(loudness, popularity)) + geom_point(alpha = 0.05, color = "#E17C05") + geom_smooth() + theme_minimal()

ggplot(tracks_clean, aes(mode, popularity)) + geom_point(alpha = 0.05, color = "#6F4070") + geom_smooth() + theme_minimal()

ggplot(tracks_clean, aes(speechiness, popularity)) + geom_point(alpha = 0.05, color = "#94346E") + geom_smooth() + theme_minimal()

ggplot(tracks_clean, aes(tempo, popularity)) + geom_point(alpha = 0.05, color = "#CC503E") + geom_smooth() + theme_minimal()

ggplot(tracks_clean, aes(time_signature, popularity)) + geom_point(alpha = 0.05, color = "#994E95") + geom_smooth() +theme_minimal()

ggplot(tracks_clean, aes(valence, popularity)) + geom_point(alpha = 0.05, color = "#1DB954FF") + geom_smooth() +theme_minimal()

Linear Regression Model

# Split data
tracks_split <- initial_split(tracks_clean, prop = 0.70) # 70%-30% train-test split
tracks_train <- training(tracks_split)
tracks_test <- testing(tracks_split)
# Create model
lr_simple <- lm(popularity ~ acousticness + danceability + energy + instrumentalness + key + liveness + loudness + mode + 
                speechiness + tempo + time_signature + valence, 
                data = tracks_train)

# Summarize model
summary(lr_simple)
## 
## Call:
## lm(formula = popularity ~ acousticness + danceability + energy + 
##     instrumentalness + key + liveness + loudness + mode + speechiness + 
##     tempo + time_signature + valence, data = tracks_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -47.309 -10.897  -0.133  10.972  62.215 
## 
## Coefficients:
##                    Estimate Std. Error t value             Pr(>|t|)
## (Intercept)       39.019192   0.720172  54.180 < 0.0000000000000002
## acousticness      -0.616999   0.259494  -2.378              0.01742
## danceability      10.923183   0.430506  25.373 < 0.0000000000000002
## energy            -1.862048   0.461447  -4.035           0.00005460
## instrumentalness  -0.709679   0.230822  -3.075              0.00211
## key               -0.027454   0.016596  -1.654              0.09807
## liveness           1.701168   0.360877   4.714           0.00000243
## loudness           0.449215   0.017860  25.151 < 0.0000000000000002
## mode               0.007194   0.122794   0.059              0.95329
## speechiness      -23.611448   0.333765 -70.743 < 0.0000000000000002
## tempo              0.002754   0.002015   1.367              0.17171
## time_signature     1.322444   0.119332  11.082 < 0.0000000000000002
## valence           -5.318778   0.287457 -18.503 < 0.0000000000000002
##                     
## (Intercept)      ***
## acousticness     *  
## danceability     ***
## energy           ***
## instrumentalness ** 
## key              .  
## liveness         ***
## loudness         ***
## mode                
## speechiness      ***
## tempo               
## time_signature   ***
## valence          ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.63 on 71344 degrees of freedom
## Multiple R-squared:  0.139,  Adjusted R-squared:  0.1388 
## F-statistic: 959.7 on 12 and 71344 DF,  p-value: < 0.00000000000000022
preds_train_lr_simple <- predict(lr_simple, newdata = tracks_train)
preds_test_lr_simple <- predict(lr_simple, newdata = tracks_test)
results <- data.frame(
              `preds` = c(preds_test_lr_simple, preds_train_lr_simple),
              `true` = c(tracks_test$popularity, tracks_train$popularity),
              `type` = c(rep("Test", nrow(tracks_test)), rep("Train", nrow(tracks_train))))

results

ggplot(results, aes(true, preds)) + geom_point(alpha = 0.05) + 
  facet_wrap(. ~ type) + 
  theme_minimal() + 
  labs(x = "True Track's Popularity", 
       y = "Predicted Tracks's Popularity")

get_rmse <- function(true_values, predictions) {
  sqrt(mean((true_values - predictions) ^ 2))
}

get_mae <- function(true_values, predictions){
  mean(abs(true_values - predictions))
}
get_rmse(tracks_train$popularity, preds_train_lr_simple)
## [1] 15.62576
get_rmse(tracks_test$popularity, preds_test_lr_simple)
## [1] 15.57006
get_mae(tracks_train$popularity, preds_train_lr_simple)
## [1] 12.65871
get_mae(tracks_test$popularity, preds_test_lr_simple)
## [1] 12.61185

Ridge Model

ridge_fit <- cv.glmnet(popularity ~ acousticness + danceability + energy + instrumentalness + key + liveness 
                       + loudness + mode + speechiness + tempo + time_signature + valence,
                       data = tracks_train, alpha = 0)
print(ridge_fit)
## Call:
## cv.glmnet.formula(formula = popularity ~ acousticness + danceability + 
##     energy + instrumentalness + key + liveness + loudness + mode + 
##     speechiness + tempo + time_signature + valence, data = tracks_train, 
##     alpha = 0)
## 
## Model fitting options:
##     Sparse model matrix: FALSE
##     Use model.frame: FALSE
##     Number of crossvalidation folds: 10
##     Alpha: 0
##     Deviance-minimizing lambda: 0.5189662  (+1 SE): 3.039596

plot(ridge_fit)


coef(ridge_fit, s = "lambda.min")
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                             s1
## (Intercept)       38.570159373
## acousticness      -0.662433093
## danceability      10.189906148
## energy            -1.276831247
## instrumentalness  -0.736573538
## key               -0.027030431
## liveness           1.249890299
## loudness           0.422609024
## mode               0.008885713
## speechiness      -22.809851155
## tempo              0.002753637
## time_signature     1.349099133
## valence           -4.941543881

Lasso Model

lasso_fit <- cv.glmnet(popularity ~ acousticness + danceability + energy + instrumentalness + key + liveness 
                       + loudness + mode + speechiness + tempo + time_signature + valence,
                       data = tracks_train, alpha = 1)

print(lasso_fit)
## Call:
## cv.glmnet.formula(formula = popularity ~ acousticness + danceability + 
##     energy + instrumentalness + key + liveness + loudness + mode + 
##     speechiness + tempo + time_signature + valence, data = tracks_train, 
##     alpha = 1)
## 
## Model fitting options:
##     Sparse model matrix: FALSE
##     Use model.frame: FALSE
##     Number of crossvalidation folds: 10
##     Alpha: 1
##     Deviance-minimizing lambda: 0.008457856  (+1 SE): 0.4209495

plot(lasso_fit)


coef(lasso_fit, s = "lambda.min")
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                             s1
## (Intercept)       38.884372839
## acousticness      -0.529384838
## danceability      10.858629158
## energy            -1.603181170
## instrumentalness  -0.705714089
## key               -0.025352048
## liveness           1.580783603
## loudness           0.443180756
## mode               .          
## speechiness      -23.568684213
## tempo              0.002464728
## time_signature     1.311516476
## valence           -5.276537779

ElasticNet Model

enet_mod <- cva.glmnet(popularity ~ acousticness + danceability + energy + instrumentalness + key + liveness 
                       + loudness + mode + speechiness + tempo + time_signature + valence, 
                       data = tracks_train, 
                       alpha = seq(0, 1, by = 0.05))

print(enet_mod)
## Call:
## cva.glmnet.formula(formula = popularity ~ acousticness + danceability + 
##     energy + instrumentalness + key + liveness + loudness + mode + 
##     speechiness + tempo + time_signature + valence, data = tracks_train, 
##     alpha = seq(0, 1, by = 0.05))
## 
## Model fitting options:
##     Sparse model matrix: FALSE
##     Use model.frame: FALSE
##     Alpha values: 0 0.05 0.1 0.15 0.2 0.25 0.3 0.35 0.4 0.45 0.5 0.55 0.6 0.65 0.7 0.75 0.8 0.85 0.9 0.95 1
##     Number of crossvalidation folds for lambda: 10

minlossplot(enet_mod, cv.type = "min")

# Function to find the best alpha
get_alpha <- function(fit) {
  alpha <- fit$alpha
  error <- sapply(fit$modlist, 
                  function(mod) {min(mod$cvm)})
  alpha[which.min(error)]
}
# Extract the best alpha value
best_alpha <- get_alpha(enet_mod)
print(best_alpha)
## [1] 0.8

# Create best Elastic Net model
best_enet_mod <- cva.glmnet(popularity ~ acousticness + danceability + energy + instrumentalness + key + liveness 
                       + loudness + mode + speechiness + tempo + time_signature + valence, 
                       data = tracks_train, 
                       alpha = best_alpha)

print(best_enet_mod)
## Call:
## cva.glmnet.formula(formula = popularity ~ acousticness + danceability + 
##     energy + instrumentalness + key + liveness + loudness + mode + 
##     speechiness + tempo + time_signature + valence, data = tracks_train, 
##     alpha = best_alpha)
## 
## Model fitting options:
##     Sparse model matrix: FALSE
##     Use model.frame: FALSE
##     Alpha values: 0.8
##     Number of crossvalidation folds for lambda: 10